2 Exploratory Analysis
2.1 Term Frequencies
This initial exploration of term frequencies allowed us to identify some extraordinarily common words in this corpus, which were then added to the stop list. Not all common words were added to the stop list, however: some may still be relevant for distinguishing certain groups of documents (for example, the abbreviation ‘vs’, which indicates a comparison and appears most notably in earnings reports).
Documents per word
# Document frequency per term: with binary weighting every non-zero entry of
# the term-document matrix becomes 1, so each row sum is the number of
# documents in which that term occurs.
bin <- weightBin(tdm)
df <- data.frame(docFreqs = row_sums(bin))

# Left panel: full distribution. The histogram is drawn on the density scale
# (after_stat(density) replaces the deprecated `..density..` notation) so the
# kernel density estimate can be overlaid on the same axis.
g1 <- ggplot(df, aes(x = docFreqs)) +
  geom_histogram(aes(y = after_stat(density)), alpha = 0.5) +
  geom_density(alpha = 0.2) +
  labs(x = "Number of Documents in which \n a Word Appears", y = "Density")

# Right panel: same variable restricted to terms appearing in fewer than 100
# documents, to make the bulk of the distribution visible. g1 already holds
# its own copy of the data, so overwriting `df` here does not affect it.
df <- data.frame(docFreqs = df$docFreqs[df$docFreqs < 100])
g2 <- ggplot(df, aes(x = docFreqs)) +
  geom_histogram(aes(y = after_stat(density)), alpha = 0.5) +
  geom_density(alpha = 0.2) +
  labs(
    x = "Number of Documents in which \n a Word Appears",
    # The y axis shows density, not counts; it was previously labelled
    # 'Frequency'.
    y = "Density",
    title = "Same Distribution Cut at x=100"
  )

grid.arrange(g1, g2, ncol = 2)
TF-IDF per word
# Total TF-IDF weight per term: re-weight the term-document matrix with
# TF-IDF and sum each row (one row per term) across all documents.
tfi <- weightTfIdf(tdm)
df <- data.frame(termFreqs = row_sums(tfi))

# Left panel: full distribution. The histogram is drawn on the density scale
# (after_stat(density) replaces the deprecated `..density..` notation) so the
# kernel density estimate can be overlaid on the same axis.
g1 <- ggplot(df, aes(x = termFreqs)) +
  geom_histogram(aes(y = after_stat(density)), alpha = 0.5) +
  geom_density(alpha = 0.2) +
  # Label typo fixed ("for each words" -> "for each word") so both panels
  # carry the same axis title.
  labs(x = "Sum of TF-IDF Weights \n for each word", y = "Density")

# Right panel: same variable restricted to terms whose summed weight is below
# 100. g1 already holds its own copy of the data, so overwriting `df` here
# does not affect it.
df <- data.frame(termFreqs = df$termFreqs[df$termFreqs < 100])
g2 <- ggplot(df, aes(x = termFreqs)) +
  geom_histogram(aes(y = after_stat(density)), alpha = 0.5) +
  geom_density(alpha = 0.2) +
  labs(
    x = "Sum of TF-IDF Weights \n for each word",
    # The y axis shows density, not counts; it was previously labelled
    # 'Frequency'.
    y = "Density",
    title = "Same Distribution Cut at x=100"
  )

grid.arrange(g1, g2, ncol = 2)
## via SVD
# The truncated SVD below was computed once and cached to disk; the code that
# produced the cache is kept (commented out) for reference.
# tfidf_tdm = weightTfIdf(tdm, normalize=TRUE)
# m = Matrix::sparseMatrix(i=tfidf_tdm$i,
# j=tfidf_tdm$j,
# x=tfidf_tdm$v,
# dims=c(tfidf_tdm$nrow, tfidf_tdm$ncol),
# dimnames = tfidf_tdm$dimnames)
# svd = irlba(m, 150)
# save(svd,file='svd.RData')

# load() restores the cached object under its saved name, `svd`.
load('docs/final_data_plots/svd.RData')

# Without the cached object, `svd` would resolve to the base::svd function and
# fail later with an obscure "closure is not subsettable" error, so check now.
stopifnot(is.list(svd), !is.null(svd$d), !is.null(svd$v))

# Scree plot of the singular values. The index is derived from the data
# (previously hard-coded as 1:150) so a cache computed with a different rank
# is plotted correctly, not recycled or rejected by data.frame().
df <- data.frame(x = seq_along(svd$d), d = svd$d)
g1 <- ggplot(data = df, aes(x = x, y = d, group = 1)) +
  geom_line(color = "red") +
  labs(
    y = "Singular Values",
    x = "index",
    title = "Screeplot of Reuters tf-idf Matrix, vlines at 10, 25"
  ) +
  geom_point() +
  # Reference lines at the candidate cut-offs named in the title.
  # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
  # `size` is kept for compatibility with older installs.
  geom_vline(xintercept = 25, linetype = "dotted", color = "blue", size = 1) +
  geom_vline(xintercept = 10, linetype = "dotted", color = "blue", size = 1)

# Static scatter of the first two columns of svd$v.
# NOTE(review): despite the name `u.df`, this uses svd$v (right singular
# vectors); presumably these correspond to documents, given that the cached
# matrix was built from a term-document matrix -- confirm.
u.df <- data.frame(x = svd$v[, 1], y = svd$v[, 2])
g2 <- ggplot(data = u.df, aes(x = x, y = y)) +
  geom_point() +
  labs(
    y = "Second Singular Component",
    x = "First Singular Component",
    title = "SVD Projection of Reuters tf-idf Term-Document Matrix"
  )

# Only the scree plot is drawn here; g2 is built but not printed in this
# section.
g1
# Interactive scatter of the first two columns of svd$v, with per-point hover
# text built from `head` and `raw_text`.
# NOTE(review): no `data` is passed to plot_ly(), so the `text` formula is
# evaluated against objects in the enclosing environment. `head` and
# `raw_text` are not defined in this section; presumably they are vectors
# with one element per row of svd$v -- confirm. If `head` is not defined it
# would resolve to the utils::head function and paste() would fail.
# NOTE(review): the literal "$" before "<br>" is kept as in the original;
# verify that it is intended in the hover text.
fig <- plot_ly(type = 'scatter', mode = 'markers') %>%
  add_trace(
    x = svd$v[, 1],
    y = svd$v[, 2],
    text = ~paste('heading:', head, "$<br>text: ", raw_text),
    hoverinfo = 'text',
    marker = list(color = 'green', opacity = 0.6),
    # FALSE, not the reassignable shorthand F.
    showlegend = FALSE
  )
fig